import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
from google.cloud import bigquery
from google.oauth2 import service_account
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px
key_path="/home/workflow/bigquey_key/eternal-calling-318816-1f09ae068787.json"
credentials = service_account.Credentials.from_service_account_file(
key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"],
)
client = bigquery.Client(credentials=credentials, project=credentials.project_id,)
dataset_ref = client.dataset("google_analytics_sample", project="bigquery-public-data")
dataset = client.get_dataset(dataset_ref)
tables = list(client.list_tables(dataset))
table_ref = dataset_ref.table("ga_sessions_20170801")
table = client.get_table(table_ref)
query1 = """
SELECT *
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE
_table_suffix = "20170801"
"""
safe_query_job= client.query(query1)
df = safe_query_job.to_dataframe()
df.columns
Index(['visitorId', 'visitNumber', 'visitId', 'visitStartTime', 'date',
'totals', 'trafficSource', 'device', 'geoNetwork', 'customDimensions',
'hits', 'fullVisitorId', 'userId', 'clientId', 'channelGrouping',
'socialEngagementType'],
dtype='object')
test = pd.DataFrame(df.groupby(['fullVisitorId']).apply(lambda x: len(x) > 1)).reset_index()
visitors = test[test[0] == True]['fullVisitorId'].values
df[df['fullVisitorId'].isin(visitors)].head()
| visitorId | visitNumber | visitId | visitStartTime | date | totals | trafficSource | device | geoNetwork | customDimensions | hits | fullVisitorId | userId | clientId | channelGrouping | socialEngagementType | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | None | 1 | 1501573386 | 1501573386 | 20170801 | {'visits': 1, 'hits': 1, 'pageviews': 1, 'time... | {'referralPath': None, 'campaign': '(not set)'... | {'browser': 'Chrome', 'browserVersion': 'not a... | {'continent': 'Europe', 'subContinent': 'Weste... | [{'index': 4, 'value': 'EMEA'}] | [{'hitNumber': 1, 'time': 0, 'hour': 0, 'minut... | 0750846065342433129 | None | None | Direct | Not Socially Engaged |
| 12 | None | 2 | 1501603594 | 1501603594 | 20170801 | {'visits': 1, 'hits': 1, 'pageviews': 1, 'time... | {'referralPath': '/', 'campaign': '(not set)',... | {'browser': 'Chrome', 'browserVersion': 'not a... | {'continent': 'Americas', 'subContinent': 'Nor... | [{'index': 4, 'value': 'North America'}] | [{'hitNumber': 1, 'time': 0, 'hour': 9, 'minut... | 5485017567268822718 | None | None | Referral | Not Socially Engaged |
| 15 | None | 3 | 1501638744 | 1501638744 | 20170801 | {'visits': 1, 'hits': 1, 'pageviews': 1, 'time... | {'referralPath': None, 'campaign': '(not set)'... | {'browser': 'Chrome', 'browserVersion': 'not a... | {'continent': 'Asia', 'subContinent': 'Souther... | [{'index': 4, 'value': 'APAC'}] | [{'hitNumber': 1, 'time': 0, 'hour': 18, 'minu... | 3239591421883808376 | None | None | Organic Search | Not Socially Engaged |
| 19 | None | 2 | 1501650737 | 1501650737 | 20170801 | {'visits': 1, 'hits': 1, 'pageviews': 1, 'time... | {'referralPath': None, 'campaign': '(not set)'... | {'browser': 'Safari', 'browserVersion': 'not a... | {'continent': 'Americas', 'subContinent': 'Nor... | [{'index': 4, 'value': 'North America'}] | [{'hitNumber': 1, 'time': 0, 'hour': 22, 'minu... | 9115044195942480006 | None | None | Direct | Not Socially Engaged |
| 24 | None | 27 | 1501656919 | 1501656919 | 20170801 | {'visits': 1, 'hits': 1, 'pageviews': 1, 'time... | {'referralPath': None, 'campaign': '(not set)'... | {'browser': 'Chrome', 'browserVersion': 'not a... | {'continent': 'Europe', 'subContinent': 'South... | [{'index': 4, 'value': 'EMEA'}] | [{'hitNumber': 1, 'time': 0, 'hour': 23, 'minu... | 3884810646891698298 | None | None | Direct | Not Socially Engaged |
total_unique_users = """
SELECT COUNT(DISTINCT fullVisitorId) as uniqueUsersQuant
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE _table_suffix = "20170801"
"""
safe_query_job= client.query(total_unique_users)
query_df= safe_query_job.to_dataframe()
query_df
| uniqueUsersQuant | |
|---|---|
| 0 | 2293 |
# count users who visited more than 1 time
query2 = """
SELECT COUNT(manyVisitsUsers) as totalQuantity
FROM
(
SELECT COUNT(*) as manyVisitsUsers
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE _table_suffix = "20170801"
GROUP BY fullVisitorId
HAVING COUNT(*) > 1
) a
"""
safe_query_job= client.query(query2)
query_df1= safe_query_job.to_dataframe()
query_df1
| totalQuantity | |
|---|---|
| 0 | 216 |
# find most visited country from
query3 ="""
SELECT COUNT(DISTINCT fullVisitorId) as total_customers, geoNetwork.country as country
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE _table_suffix = "20170801"
GROUP BY geoNetwork.country
ORDER BY total_customers DESC
"""
safe_query_job= client.query(query3)
query_df3= safe_query_job.to_dataframe()
geo = px.data.gapminder()
geo = geo.drop_duplicates(subset=['country'])
query_df3 = pd.merge(query_df3, geo[['country','iso_alpha']], on =['country'], how='inner')
fig = px.scatter_geo(query_df3,locations="iso_alpha",
size="total_customers",
color="country",
title="Customers distribution by country",
width = 1100,
height = 500,
)
fig.show()
Lets Examine Google USA merchandise market
# revenue by state
query4 ="""
SELECT SUM(hits.transaction.transactionRevenue / 1000000) AS revenueTotal, geoNetwork.region AS state
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`,
UNNEST (hits) AS hits
WHERE _table_suffix = "20170801"
AND geoNetwork.country = 'United States'
GROUP BY geoNetwork.region
ORDER BY revenueTotal DESC
LIMIT 6
"""
safe_query_job= client.query(query4)
query_df4= safe_query_job.to_dataframe()
query_df4
| revenueTotal | state | |
|---|---|---|
| 0 | 3670.02 | California |
| 1 | 1723.23 | New York |
| 2 | 1448.89 | not available in demo dataset |
| 3 | 1442.46 | Illinois |
| 4 | 238.53 | Georgia |
| 5 | 137.07 | Texas |
fig = px.bar(query_df4, x='state', y='revenueTotal',
labels={'':'Revenue By State'}, height=400)
fig.show()
# revenue by state
query5 ="""
SELECT SUM(hits.transaction.transactionRevenue / 1000000) / COUNT(DISTINCT fullVisitorId) AS revenueByCustomer, geoNetwork.region AS state
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`,
UNNEST (hits) AS hits
WHERE _table_suffix = "20170801"
AND geoNetwork.country = 'United States'
GROUP BY geoNetwork.region
ORDER BY revenueByCustomer DESC
LIMIT 6
"""
safe_query_job = client.query(query5)
query_df5 = safe_query_job.to_dataframe()
query_df5
| revenueByCustomer | state | |
|---|---|---|
| 0 | 62.715652 | Illinois |
| 1 | 23.853000 | Georgia |
| 2 | 20.514643 | New York |
| 3 | 10.576427 | California |
| 4 | 5.643333 | Massachusetts |
| 5 | 4.684286 | Michigan |
fig = px.bar(query_df5, x='state', y='revenueByCustomer',
labels={'':'Revenue By Customer in USA'}, height=400)
fig.show()
Lets find out what are most popular items in Illinois and California
query6 ="""
SELECT COUNT(items.v2ProductCategory )AS categoryPurchases, v2ProductCategory, SUM(hits.transaction.transactionRevenue / 1000000) AS revenueTotalByCategory
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`,
UNNEST (hits) AS hits,
UNNEST(hits.product) AS items
WHERE _table_suffix = "20170801"
AND geoNetwork.country = 'United States' AND geoNetwork.region = 'Illinois'
GROUP BY items.v2ProductCategory
ORDER BY revenueTotalByCategory DESC, 1
LIMIT 10
"""
safe_query_job = client.query(query6)
query_df6 = safe_query_job.to_dataframe()
query_df6
| categoryPurchases | v2ProductCategory | revenueTotalByCategory | |
|---|---|---|---|
| 0 | 87 | Apparel | 5443.79 |
| 1 | 22 | Bags | 1534.29 |
| 2 | 5 | Headgear | 985.73 |
| 3 | 5 | Electronics | 985.73 |
| 4 | 5 | Drinkware | 985.73 |
| 5 | 7 | Notebooks & Journals | 985.73 |
| 6 | 9 | Office | 985.73 |
| 7 | 9 | ${productitem.product.origCatName} | 274.28 |
| 8 | 2 | Home/Electronics/Audio/ | NaN |
| 9 | 2 | Waze | NaN |
query7 ="""
SELECT COUNT(items.v2ProductCategory )AS categoryPurchases, v2ProductCategory, SUM(hits.transaction.transactionRevenue / 1000000) AS revenueTotalByCategory
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`,
UNNEST (hits) AS hits,
UNNEST(hits.product) AS items
WHERE _table_suffix = "20170801"
AND geoNetwork.country = 'United States' AND geoNetwork.region = 'California'
GROUP BY items.v2ProductCategory
ORDER BY revenueTotalByCategory DESC, 1
LIMIT 10
"""
safe_query_job = client.query(query7)
query_df7 = safe_query_job.to_dataframe()
query_df7
| categoryPurchases | v2ProductCategory | revenueTotalByCategory | |
|---|---|---|---|
| 0 | 36 | Notebooks & Journals | 5911.51 |
| 1 | 30 | Drinkware | 3196.43 |
| 2 | 21 | Headgear | 3163.67 |
| 3 | 58 | Office | 3043.45 |
| 4 | 294 | Apparel | 2557.64 |
| 5 | 7 | Bags | 178.39 |
| 6 | 13 | Electronics | 82.43 |
| 7 | 5 | Fun | 32.76 |
| 8 | 8 | Home/Gift Cards/ | NaN |
| 9 | 34 | Lifestyle | NaN |
query9 ="""
SELECT*, RANK() OVER(PARTITION BY region ORDER BY ProductDetailViews DESC) Rank
FROM
(
SELECT geoNetwork.region, SUM(CASE WHEN hits.eCommerceAction.action_type = "2" THEN 1 END) AS ProductDetailViews, items.v2ProductName AS product_name,
FROM
(
SELECT*
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE _table_suffix = "20170801" AND geoNetwork.country = 'United States'
),
UNNEST (hits) AS hits,
UNNEST(hits.product) AS items
GROUP BY product_name, geoNetwork.region
)
"""
safe_query_job = client.query(query9)
query_df9 = safe_query_job.to_dataframe()
query_df9.sort_values(by=['Rank'],ascending=True).head(20)
| region | ProductDetailViews | product_name | Rank | |
|---|---|---|---|---|
| 0 | Washington | 6.0 | Google Women's Performance Full Zip Jacket Black | 1 |
| 388 | Texas | 3.0 | Red Shine 15 oz Mug | 1 |
| 559 | Massachusetts | 6.0 | Google Men's Zip Hoodie | 1 |
| 742 | New York | 7.0 | Google Men's 100% Cotton Short Sleeve Hero Tee... | 1 |
| 1551 | California | 18.0 | Google Rucksack | 1 |
| 953 | North Carolina | 4.0 | Android Men's Engineer Short Sleeve Tee Charcoal | 1 |
| 188 | Michigan | 3.0 | Android Sticker Sheet Ultra Removable | 1 |
| 1007 | (not set) | 1.0 | Google Vintage Henley Grey/Black | 1 |
| 1046 | Pennsylvania | 1.0 | Google Alpine Style Backpack | 1 |
| 1047 | Pennsylvania | 1.0 | Android Hard Cover Journal | 1 |
| 1092 | not available in demo dataset | 11.0 | YouTube Men's Short Sleeve Hero Tee Black | 1 |
| 1388 | Illinois | 5.0 | Google Sunglasses | 1 |
| 1316 | District of Columbia | 1.0 | Google Sunglasses | 1 |
| 1317 | District of Columbia | 1.0 | Google Women's Convertible Vest-Jacket Sea Foa... | 1 |
| 291 | Georgia | 4.0 | Android Sticker Sheet Ultra Removable | 1 |
| 1044 | (not set) | NaN | YouTube Women's Short Sleeve Tri-blend Badge T... | 2 |
| 1020 | (not set) | NaN | Android Twill Cap | 2 |
| 1019 | (not set) | NaN | Android Wool Heather Cap Heather/Black | 2 |
| 1018 | (not set) | NaN | Android Men's Long & Lean Badge Tee Charcoal | 2 |
| 1017 | (not set) | NaN | Android Men's Short Sleeve Hero Tee White | 2 |
query10 ="""
SELECT*, RANK() OVER(PARTITION BY region ORDER BY Transactions DESC) Rank
FROM
(
SELECT geoNetwork.region, SUM(CASE WHEN hits.eCommerceAction.action_type = "6" THEN 1 END) AS Transactions, items.v2ProductName AS product_name,items.v2ProductCategory
FROM
(
SELECT*
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE _table_suffix = "20170801" AND geoNetwork.country = 'United States'
),
UNNEST (hits) AS hits,
UNNEST(hits.product) AS items
GROUP BY product_name, geoNetwork.region, items.v2ProductCategory
)
"""
safe_query_job = client.query(query10)
query_df10 = safe_query_job.to_dataframe()
query_df10[~query_df10['Transactions'].isna()].sort_values(by=['Rank'],ascending=True).head(30)
| region | Transactions | product_name | v2ProductCategory | Rank | |
|---|---|---|---|---|---|
| 100 | California | 4.0 | Google Women's Vintage Hero Tee Black | Apparel | 1 |
| 791 | Michigan | 2.0 | Android Sticker Sheet Ultra Removable | Office | 1 |
| 925 | not available in demo dataset | 4.0 | Google Blackout Cap | Headgear | 1 |
| 926 | not available in demo dataset | 4.0 | Google Power Bank | Electronics | 1 |
| 1588 | Illinois | 6.0 | Google Tri-blend Hoodie Grey | Apparel | 1 |
| 3161 | Washington | 2.0 | Google Tote Bag | Bags | 1 |
| 1864 | Georgia | 8.0 | Google Sunglasses | Lifestyle | 1 |
| 1989 | District of Columbia | 4.0 | Google Sunglasses | Lifestyle | 1 |
| 790 | Michigan | 2.0 | Android Men's Long Sleeve Badge Crew Tee Heather | Apparel | 1 |
| 2074 | New York | 6.0 | Google Men's 100% Cotton Short Sleeve Hero Tee... | Apparel | 1 |
| 2578 | Texas | 12.0 | Google Women's Vintage Hero Tee Black | Apparel | 1 |
| 2878 | Massachusetts | 2.0 | Google Men's Zip Hoodie | Apparel | 1 |
| 3156 | Washington | 2.0 | Android Lunch Kit | Lifestyle | 1 |
| 3157 | Washington | 2.0 | Android 17oz Stainless Steel Sport Bottle | Drinkware | 1 |
| 3158 | Washington | 2.0 | Google 5-Panel Snapback Cap | Headgear | 1 |
| 3159 | Washington | 2.0 | Google Kick Ball | Lifestyle | 1 |
| 3160 | Washington | 2.0 | Google Sunglasses | Lifestyle | 1 |
| 2075 | New York | 6.0 | Google Men's 100% Cotton Short Sleeve Hero Tee... | Apparel | 1 |
| 789 | Michigan | 2.0 | Google Women's Scoop Neck Tee Black | Apparel | 1 |
| 3162 | Washington | 2.0 | Women's Performance Full Zip Jacket Black | Apparel | 1 |
| 107 | California | 4.0 | Google Blackout Cap | Headgear | 1 |
| 101 | California | 4.0 | Android Men's Long Sleeve Badge Crew Tee Heather | Apparel | 1 |
| 102 | California | 4.0 | Google Men's Watershed Full Zip Hoodie Grey | Apparel | 1 |
| 103 | California | 4.0 | Android Men's Short Sleeve Hero Tee White | Apparel | 1 |
| 104 | California | 4.0 | Google Men's Bayside Graphic Tee | Apparel | 1 |
| 105 | California | 4.0 | Google Men's 100% Cotton Short Sleeve Hero Tee... | Apparel | 1 |
| 106 | California | 4.0 | Google 22 oz Water Bottle | Drinkware | 1 |
| 1868 | Georgia | 2.0 | Google Leather Perforated Journal | Notebooks & Journals | 2 |
| 1869 | Georgia | 2.0 | Google Men's Long & Lean Tee Grey | Apparel | 2 |
| 2579 | Texas | 8.0 | Google Men's Long & Lean Tee Charcoal | Apparel | 2 |
query_df10[(query_df10['region'] == 'California') & (query_df10['v2ProductCategory'] == 'Drinkware')]
| region | Transactions | product_name | v2ProductCategory | Rank | |
|---|---|---|---|---|---|
| 106 | California | 4.0 | Google 22 oz Water Bottle | Drinkware | 1 |
| 115 | California | 2.0 | Android 17oz Stainless Steel Sport Bottle | Drinkware | 9 |
| 135 | California | 2.0 | Google 17oz Stainless Steel Sport Bottle | Drinkware | 9 |
| 631 | California | NaN | Foam Can and Bottle Cooler | Drinkware | 42 |
| 771 | California | NaN | Android Rise 14 oz Mug | Drinkware | 42 |
query_df10[(query_df10['region'] == 'California') & (query_df10['v2ProductCategory'] == 'Notebooks & Journals')]
| region | Transactions | product_name | v2ProductCategory | Rank | |
|---|---|---|---|---|---|
| 108 | California | 2.0 | Google RFID Journal | Notebooks & Journals | 9 |
| 123 | California | 2.0 | Google Leather Journal-Black | Notebooks & Journals | 9 |
| 126 | California | 2.0 | Google Spiral Journal with Pen | Notebooks & Journals | 9 |
| 583 | California | NaN | Google Spiral Leather Journal | Notebooks & Journals | 42 |
| 717 | California | NaN | Google Leather Perforated Journal | Notebooks & Journals | 42 |
query11 ="""
SELECT SUM(hits.transaction.transactionRevenue / 1000000) AS revenueByCustomer, fullVisitorId
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`,
UNNEST (hits) AS hits
WHERE _table_suffix = "20170801"
AND geoNetwork.country = 'United States' AND geoNetwork.region = 'Illinois'
GROUP BY fullVisitorId
ORDER BY revenueByCustomer DESC, 1
LIMIT 10
"""
safe_query_job = client.query(query11)
query_df11 = safe_query_job.to_dataframe()
query_df11
| revenueByCustomer | fullVisitorId | |
|---|---|---|
| 0 | 985.73 | 7311242886083854158 |
| 1 | 404.96 | 5772921781547143127 |
| 2 | 51.77 | 0815047945706399620 |
| 3 | NaN | 8345556086864169755 |
| 4 | NaN | 2688806293820616307 |
| 5 | NaN | 502979622235679017 |
| 6 | NaN | 4649229463376467903 |
| 7 | NaN | 9176912249608646871 |
| 8 | NaN | 0004915997121163857 |
| 9 | NaN | 1603126987847875573 |
sns.boxplot(x=query_df11["revenueByCustomer"])
<AxesSubplot:xlabel='revenueByCustomer'>
query12 ="""
SELECT SUM(hits.transaction.transactionRevenue / 1000000) AS revenueByCustomer, fullVisitorId
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`,
UNNEST (hits) AS hits
WHERE _table_suffix = "20170801"
AND geoNetwork.country = 'United States' AND geoNetwork.region = 'California'
GROUP BY fullVisitorId
ORDER BY revenueByCustomer DESC, 1
LIMIT 10
"""
safe_query_job = client.query(query12)
query_df12 = safe_query_job.to_dataframe()
query_df12
| revenueByCustomer | fullVisitorId | |
|---|---|---|
| 0 | 2935.61 | 9308310352918219134 |
| 1 | 178.39 | 9008243837280281377 |
| 2 | 139.96 | 8016003971239765913 |
| 3 | 58.00 | 698297406423033664 |
| 4 | 54.54 | 8841197329105123740 |
| 5 | 49.67 | 8730493892218660083 |
| 6 | 46.00 | 0377506370870345639 |
| 7 | 40.29 | 0509972280802528263 |
| 8 | 35.58 | 9800325346437826374 |
| 9 | 34.79 | 0188308435688462855 |
sns.boxplot(x=query_df12["revenueByCustomer"])
# Customer outlier in california
<AxesSubplot:xlabel='revenueByCustomer'>
query_df12[query_df12['fullVisitorId'] == '9308310352918219134']
| revenueByCustomer | fullVisitorId | |
|---|---|---|
| 0 | 2935.61 | 9308310352918219134 |